home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Software Vault: The Gold Collection
/
Software Vault - The Gold Collection (American Databankers) (1993).ISO
/
cdr49
/
strpp300.zip
/
REGEXP.CPP
< prev
next >
Wrap
C/C++ Source or Header
|
1993-04-11
|
16KB
|
666 lines
/* -------------------------------------------------------------------- */
/* String++ Version 3.00 04/10/93 */
/* */
/* Enhanced string class for Turbo C++/Borland C++. */
/* Copyright 1991-1993 by Carl W. Moreland */
/* */
/* Derived from code Copyright 1988, Jim Mischel. All rights reserved. */
/* */
/* regexp.cpp */
/* -------------------------------------------------------------------- */
#include <stddef.h>
#include "regexp.h"
#define TRUE -1
#define FALSE 0
#define ALMOST 1 // returned when a closure matches a NULL
const char ENDSTR = '\0';
const char EOL = '$';
const char BOL = '^';
const char NEGATE = '^';
const char CCL = '[';
const char NCCL = ']';
const char CCLEND = ']';
const char ANY = '.';
const char DASH = '-';
const char OR = '|';
const char ESCAPE = '\\';
const char LPAREN = '(';
const char RPAREN = ')';
const char CLOSURE = '*';
const char POS_CLO = '+';
const char ZERO_ONE = '?';
const char LITCHAR = 'c';
const char END_TERM = 'e';
const String FS_DEFAULT = "[ \t]+";
int RSTART; // start of matched substring
int RLENGTH; // length of matched substring
int NF; // number of fields from most current split
RegExp FS(FS_DEFAULT); // global field separator
/* -------------------------------------------------------------------- */
RegExp::RegExp(const String& s)
{
reExpression = s;
if(reExpression() != "")
MakePattern();
}
void RegExp::operator=(const char* p)
{
reExpression = p;
if(reExpression() != "")
MakePattern();
}
void RegExp::operator=(const String& s)
{
reExpression = s;
if(reExpression() != "")
MakePattern();
}
/* -------------------------------------------------------------------- */
// MakePattern() - "compile" the regular expression into rePattern
const char* RegExp::MakePattern(void)
{
static String tmp;
if(ParseExpression(tmp) == "" || reExpression != ENDSTR)
{
rePattern = "";
reExpression.Reset();
return NULL;
}
rePattern = tmp;
reExpression.Reset();
return rePattern;
}
// ParseExpression() - Parse and translate an expression. Returns a pointer
// to the compiled expression, or NULL on error.
const char* RegExp::ParseExpression(String& expr)
{
String term;
expr = "";
if(ParseTerm(term) == "") // get the first term
return(NULL);
while(reExpression == OR) // parse all subsequent terms
{
expr += OR;
expr += term;
expr += END_TERM;
reExpression++;
if(ParseTerm(term) == "")
return (NULL);
}
expr += term;
expr += END_TERM;
return expr;
}
// ParseTerm() - parse and translate a term. Returns a pointer to the
// compiled term or NULL on error.
const char* RegExp::ParseTerm(String& term)
{
String factor;
term = "";
if(reExpression == BOL)
{
term += reExpression[0];
reExpression++;
}
do
{
if(ParseFactor(factor) == "")
return(NULL);
term += factor;
}
while(IsFactor()); // parse all factors of this term
return term;
}
// IsFactor() - returns TRUE for a valid factor character
int RegExp::IsFactor(void)
{
static char* nfac_chars = "^|)]+?*";
return (strchr(nfac_chars, reExpression[0]) == NULL) ? TRUE : FALSE;
}
// ParseFactor() - parse and translate a factor. Returns a pointer to the
// compiled factor or NULL on error.
const char* RegExp::ParseFactor(String& factor)
{
String tmp;
factor = "";
switch(reExpression[0])
{
case LPAREN: // ( - parenthesised expression
reExpression++;
ParseExpression(tmp);
factor += tmp;
if(reExpression != RPAREN)
return(NULL);
reExpression++;
break;
case CCL: // [ - character class; Ex: [0-9]
reExpression++;
ParseCCL(tmp);
factor += tmp;
if(reExpression != CCLEND)
return(NULL);
reExpression++;
break;
case ANY: // .
case EOL: // $
factor += reExpression[0];
reExpression++;
break;
case ESCAPE : // \
reExpression++;
factor += LITCHAR;
factor += ParseEscape();
reExpression++;
break;
case CLOSURE: // *
case POS_CLO: // +
case ZERO_ONE: // ?
case NEGATE: // ^
case CCLEND: // ]
case RPAREN: // )
case OR: // |
return(NULL); // not valid characters
default: // literal character
factor += LITCHAR;
factor += reExpression[0];
reExpression++;
break;
}
if(reExpression == CLOSURE || // check for closure
reExpression == ZERO_ONE ||
reExpression == POS_CLO)
{
if(ParseClosure(factor) == FALSE)
return(NULL);
}
return factor;
}
// ParseEscape() - returns ASCII value of character(s) following ESCAPE
char RegExp::ParseEscape(void)
{
static unsigned char ch;
switch(reExpression[0])
{
case 'b': reExpression++; return ('\b'); // backspace
case 't': reExpression++; return ('\t'); // tab
case 'f': reExpression++; return ('\f'); // formfeed
case 'n': reExpression++; return ('\n'); // linefeed
case 'r': reExpression++; return ('\r'); // carriage return
case '0': // 0-7 is octal constant
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
{
ch = reExpression[0] - '0';
reExpression++;
if(reExpression >= '0' && reExpression < '8')
{
ch <<= 3;
ch += reExpression[0] - '0';
reExpression++;
}
if(reExpression >= '0' && reExpression < '8')
{
ch <<= 3;
ch += reExpression[0] - '0';
reExpression++;
}
return(ch);
}
default: // otherwise, just that char
return(reExpression[0]);
}
}
// ParseClosure() - place closure character and size before the factor
// in the compiled string.
int RegExp::ParseClosure(String& factor)
{
if(factor.Len() > 253)
return FALSE; // closure expression too large
factor.Insert(0, " ");
factor[0] = reExpression[0];
factor[1] = (char)(factor.Len()-2);
reExpression++;
return TRUE;
}
// ParseCCL() - parse and translate a character class. Return pointer to the
// compiled class or NULL on error.
const char* RegExp::ParseCCL(String& ccl)
{
int first = TRUE;
int len;
ccl = "[ ";
if(reExpression == NEGATE) // if first character is NEGATE (^)
{
ccl[0] = NCCL; // then we have a negated
reExpression++; // character class
}
// parse all characters up to the closing bracket or end of string marker
while(reExpression != CCLEND && reExpression != ENDSTR)
{
if(reExpression == DASH && first == FALSE) // DASH, check for range
{
reExpression++;
if(reExpression == NCCL)
ccl += DASH; // not range, literal DASH
else
{
ParseDASH(ccl, reExpression);
reExpression++;
}
}
else
{
if(reExpression == ESCAPE)
{
reExpression++;
ccl += ParseEscape();
reExpression++;
}
else
{
ccl += reExpression[0];
reExpression++;
}
}
first = FALSE;
}
len = ccl.Len()-2;
if(len > 255)
return NULL; // character class too large
else
{
ccl[1] = (char)len; // store CCL length at ccl[1]
return ccl;
}
}
// ParseDASH() - fill in range characters.
const char* RegExp::ParseDASH(String& ccl, char ch)
{
static char c;
for(c = ccl[ccl.Len() - 1] + 1; c <= ch; c++)
ccl += c;
return ccl;
}
/* -------------------------------------------------------------------- */
// Match() - Return the position of the first character of the left-most
// longest substring of s that Matches pattern, or NULL if no match is
// found. Sets RSTART and RLENGTH.
int RegExp::Match(const char* s, const char* pattern)
{
char* c = (char*)s;
reLastChar = (char*)s;
while(*c != ENDSTR)
{
if(MatchTerm(int(c-(char*)s), c, (char*)pattern) != FALSE)
{
RSTART = int(c-(char*)s);
RLENGTH = int(reLastChar - c);
return RSTART;
}
c++;
}
RSTART = RLENGTH = 0;
return -1;
}
// MatchTerm() - Match a compiled term. Returns TRUE, FALSE, or ALMOST.
int RegExp::MatchTerm(int offset, char* s, char* pattern)
{
reLastChar = s;
if(*pattern == ENDSTR)
return FALSE;
do
{
switch(*pattern)
{
case BOL: // ^ - match beginning of line
if(offset != 0)
return FALSE;
pattern++;
break;
case LITCHAR: // c - match literal character
if(*s++ != *++pattern)
return FALSE;
pattern++;
break;
case END_TERM: // e - skip end-of-term character
pattern++;
break;
case ANY: // . - match any character
if(*s++ == ENDSTR) // except end of string
return FALSE;
pattern++;
break;
case OR:
return MatchOR(offset, s, pattern);
case CCL: // [ - character class
case NCCL: // ]
if(*s == ENDSTR)
return FALSE;
if(!MatchCCL(*s++, pattern++))
return FALSE;
pattern += pattern[0] + 1;
break;
case EOL: // $ - match end of string
if(*s != ENDSTR)
return FALSE;
pattern++;
break;
case ZERO_ONE: // ?
return Match_0_1(offset, s, pattern);
case CLOSURE: // *
case POS_CLO: // +
{
char ClosurePattern[1024];
strncpy(ClosurePattern, pattern+2, *(pattern+1));
ClosurePattern[*(pattern+1)] = ENDSTR;
return MatchClosure(offset, s, pattern, ClosurePattern);
}
default:
// If we get to this point, then something has gone very wrong.
// Most likely, someone has tried to match with an invalid
// compiled pattern.
return FALSE;
}
reLastChar = s;
} while(*pattern != ENDSTR);
return TRUE;
}
// MatchOR() - Handles selection processing.
int RegExp::MatchOR(int offset, char* s, char* pattern)
{
char tmp_pattern[1024];
char *t1, *t2, *junk;
// The first case is build into tmp_pattern. Second case is already there.
// Both patterns are searched to determine the longest matched substring.
tmp_pattern[0] = ENDSTR;
pattern++;
junk = (char*)SkipTerm(pattern);
strncat(tmp_pattern, pattern, int(junk-pattern));
strcat(tmp_pattern, SkipTerm(junk));
t1 = (MatchTerm(offset, s, tmp_pattern) != FALSE) ? reLastChar : NULL;
// The second pattern need not be searched if the first pattern results
// in a match through to the end of the string, since the longest possible
// match has already been found.
if(t1 == NULL || *reLastChar != ENDSTR)
{
t2 = (MatchTerm(offset, s, junk) != FALSE) ? reLastChar : NULL;
// determine which matched the longest substring
if(t1 != NULL && (t2 == NULL || t1 > t2))
reLastChar = t1;
}
return (t1 == NULL && t2 == NULL) ? FALSE : TRUE;
}
// SkipTerm() - Skip over the current term and return a pointer to the
// next term in the pattern.
const char* RegExp::SkipTerm(char* pattern)
{
register int nterm = 1;
while(nterm > 0)
{
switch(*pattern)
{
case OR:
nterm++;
break;
case CCL:
case NCCL:
case CLOSURE:
case ZERO_ONE:
case POS_CLO:
pattern++;
pattern += pattern[0];
break;
case END_TERM:
nterm--;
break;
case LITCHAR:
pattern++;
break;
}
pattern++;
}
return pattern;
}
// Match_0_1() - Match the ZERO_ONE operator. First, this routine attempts
// to match the entire pattern with the input string. If that fails, it
// skips over the closure pattern and attempts to match the rest of the
// pattern.
int RegExp::Match_0_1(int offset, char* s, char* pattern)
{
char* old_s = s;
if(MatchTerm(offset, s, pattern+2) == TRUE)
return TRUE;
else if(MatchTerm(offset, old_s, pattern+2+*(pattern+1)) == FALSE)
return FALSE;
else
return ALMOST;
}
// MatchClosure() - Match CLOSURE and POS_CLO. Match as many of the
// closure patterns as possible, then attempt to match the remaining
// pattern with what's left of the input string. Backtrack until we've
// either matched the remaing pattern or we arrive back at where we
// started.
int RegExp::MatchClosure(int offset, char* s,
char* pattern, char* ClosurePattern)
{
char* old_s = s;
if(MatchTerm(offset, s, ClosurePattern) == TRUE)
{
old_s = reLastChar;
if(MatchClosure(offset, old_s, pattern, ClosurePattern) != FALSE)
return TRUE;
else
return MatchTerm(offset, old_s, pattern+2+*(pattern+1));
}
else if(*pattern != CLOSURE) // POS_CLO requires at least one match
return FALSE;
else if(MatchTerm(offset, old_s, pattern+2+*(pattern+1)) == TRUE)
return ALMOST;
else
return FALSE;
}
// MatchCCL() - Match a character class or negated character class
int RegExp::MatchCCL(char c, char* pattern)
{
register int x;
char ccl = *pattern++;
for(x = *pattern; x > 0; x--)
{
if(c == pattern[x])
return(ccl == CCL);
}
return(ccl != CCL);
}
/* -------------------------------------------------------------------- */
// Return the position of the first instance of t in s, or -1 if no match.
int index(const String& s, const RegExp& t)
{
return match(s, (RegExp)t);
}
// sub() - Substitute 'to' for the leftmost longest substring of str
// matched by the regular expression 'from'. Return number of substitutions
// made.
int sub(RegExp& from, const String& to, String& str)
{
if(match(str, from) == -1)
return 0;
str.Replace(RSTART, RLENGTH, to);
return 1;
}
// gsub() - Substitute 'to' globally for all substrings in str matched by
// the regular expression 'from'. Return number of substitutions made.
int gsub(RegExp& from, const String& to, String& str, unsigned count)
{
int n = 0;
ParseString tmp = str;
while(match(tmp, from) != -1)
{
tmp.Replace(RSTART, RLENGTH, to);
tmp += RSTART+RLENGTH;
if(++n == count)
break;
}
tmp.Reset();
str = tmp;
return n;
}
// split() - split the string s into fields in the array a on field
// separator fs. Returns number of fields. Also sets the global variable
// NF.
int split(const String& s, String*& array, const RegExp& fs)
{
String *tmp;
ParseString ps = s;
int rstart = RSTART; // save RSTART and RLENGTH
int rlength = RLENGTH;
NF = 1;
while(match(ps, (RegExp)fs) != -1)
{
ps += RSTART+RLENGTH;
NF++;
}
tmp = new String[NF];
ps.Reset();
NF = 0;
while(match(ps, (RegExp)fs) != -1)
{
tmp[NF++] = ps(0, RSTART);
ps += RSTART+RLENGTH;
}
array = tmp;
RSTART = rstart; // restore globals
RLENGTH = rlength;
return (NF);
}
ostream& operator<<(ostream& strm, const RegExp& re)
{
return strm << re.reExpression();
}